import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from shapely.geometry import Point
from shapely import wkt
import geopandas as gpd
from tabulate import tabulate
from shapely.geometry import Point
from matplotlib.colors import LogNorm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from collections import Counter
Project Code
Team 4 - Section 20
# We all called the main MCMF dataset something different in our explorations.
# Each exploration reads its own copy of the same CSV so per-section
# transformations don't interfere with one another.
mcmf_data = pd.read_csv('My_CHI._My_Future._Programs_20241113.csv') # distribution of variables
my_chi_df = pd.read_csv('My_CHI._My_Future._Programs_20241113.csv') # Q1 exploration (Gina)
project_data = pd.read_csv('My_CHI._My_Future._Programs_20241113.csv') # Q2 exploration (Luna)
mcmf_project_data = pd.read_csv('My_CHI._My_Future._Programs_20241113.csv') # Q3 exploration (Isabella)
project_data_full = pd.read_csv('My_CHI._My_Future._Programs_20241113.csv') # Q4 exploration (Owen)
# External datasets, tagged with the question(s) that use them.
chi_nei = pd.read_csv('CommAreas_20241114.csv') # Q1, Q2
comm_areas_df = pd.read_csv('CommAreas_20241114.csv') #Q2
income_df = pd.read_csv('Census_Data.csv') #Q2
temp_data = pd.read_csv('chicago_monthly_temp_avg.csv') #Q3
bus_data_full = pd.read_csv('CTA_BusStops_20241118.csv') #Q4
train_data_full = pd.read_csv('CTA_-_System_Information_-_List_of__L__Stops_20241118.csv') #Q41 Data quality check / cleaning / preparation
1.1 Distribution of variables
By Isabella Seo
Here is the distribution of the variables used from the original MCMF dataset as well as the original data of external datasets. There are other variables that were explored, such as ones that were made, will be explored in individual analyses.
# Numeric MCMF columns used for the distribution summary below.
numerical_data = mcmf_data.loc[:, ['Capacity', 'Min Age', 'Max Age', 'Longitude', 'Latitude']]
numerical_data.describe()
| Capacity | Min Age | Max Age | Longitude | Latitude | |
|---|---|---|---|---|---|
| count | 3.644920e+05 | 375108.000000 | 375108.000000 | 366508.000000 | 366508.000000 |
| mean | 4.428121e+03 | 8.763303 | 44.437189 | -87.680867 | 41.853308 |
| std | 6.162220e+05 | 6.591697 | 42.231276 | 0.126199 | 0.099811 |
| min | 0.000000e+00 | 0.000000 | 0.000000 | -120.961998 | 38.922466 |
| 25% | 1.000000e+01 | 3.000000 | 11.000000 | -87.717010 | 41.776699 |
| 50% | 1.500000e+01 | 6.000000 | 18.000000 | -87.680382 | 41.864201 |
| 75% | 2.500000e+01 | 14.000000 | 99.000000 | -87.642799 | 41.946701 |
| max | 9.910181e+07 | 65.000000 | 171.000000 | -87.530502 | 42.147499 |
#ignoring outliers
sns.boxplot(numerical_data[['Capacity', 'Min Age', 'Max Age']], showfliers = False)
sns.boxplot(numerical_data['Longitude'], showfliers = False)
sns.boxplot(numerical_data['Latitude'], showfliers = False)
Here is the data distribution of all the categorical data we used from the MCMF dataset.
Missing values:
# Categorical MCMF columns used for the distribution summary below.
categorical_data = mcmf_data.loc[:, ['Program Price', 'Scholarship Available', 'Participants Paid', 'Transport Provided', 'Has Free Food', 'Category Name', 'Geographic Cluster Name', 'Meeting Type']]
print(categorical_data.isnull().sum())
Program Price 0
Scholarship Available 0
Participants Paid 4838
Transport Provided 5360
Has Free Food 2661
Category Name 1
Geographic Cluster Name 130071
Meeting Type 0
dtype: int64
Number of levels:
categorical_data.nunique()
Program Price 4
Scholarship Available 2
Participants Paid 2
Transport Provided 2
Has Free Food 2
Category Name 23
Geographic Cluster Name 90
Meeting Type 2
dtype: int64
#tabulation of categorical variables
def describe_categorical(col):
    """Summarize one categorical column: missing count, cardinality, and its top levels."""
    counts = col.value_counts().head(5)
    summary = {
        'Missing Values': col.isnull().sum(),
        'Unique Values': col.nunique(),
        'Top Levels': counts.index.tolist(),
        'Frequency of Top Levels': counts.tolist(),
    }
    return summary
# Build one summary row per categorical column, then tabulate.
# (The original initialized `results = []` only to overwrite it immediately —
# that dead assignment is removed.)
description = categorical_data.apply(describe_categorical)
results = pd.DataFrame(description.tolist(), index=categorical_data.columns)
print(tabulate(results, headers='keys', tablefmt='grid'))
+-------------------------+------------------+-----------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+
| | Missing Values | Unique Values | Top Levels | Frequency of Top Levels |
+=========================+==================+=================+==============================================================================================================================+======================================+
| Program Price | 0 | 4 | ['Free', '$50 or Less', 'More Than $50', 'Unknown'] | [204544, 122619, 37131, 10814] |
+-------------------------+------------------+-----------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+
| Scholarship Available | 0 | 2 | [False, True] | [374584, 524] |
+-------------------------+------------------+-----------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+
| Participants Paid | 4838 | 2 | ['Not Paid', 'Paid, Type Unknown'] | [369071, 1199] |
+-------------------------+------------------+-----------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+
| Transport Provided | 5360 | 2 | [False, True] | [369511, 237] |
+-------------------------+------------------+-----------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+
| Has Free Food | 2661 | 2 | [False, True] | [368522, 3925] |
+-------------------------+------------------+-----------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+
| Category Name | 1 | 23 | ['Sports + Wellness.', 'Music & Art.', 'Reading & Writing.', 'Academic Support', 'Science'] | [167930, 112705, 36597, 12841, 9640] |
+-------------------------+------------------+-----------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+
| Geographic Cluster Name | 130071 | 90 | ['Northwest Equity Zone', 'West Equity Zone', 'North/Central Equity Zone', 'Far South Equity Zone', 'Southwest Equity Zone'] | [24504, 18434, 18021, 16458, 14982] |
+-------------------------+------------------+-----------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+
| Meeting Type | 0 | 2 | ['face_to_face', 'online'] | [363649, 11459] |
+-------------------------+------------------+-----------------+------------------------------------------------------------------------------------------------------------------------------+--------------------------------------+
Levels and distribution:
price = sns.barplot(categorical_data['Program Price'].value_counts())
price.bar_label(price.containers[0])
[Text(0, 0, '204544'),
Text(0, 0, '122619'),
Text(0, 0, '37131'),
Text(0, 0, '10814')]
scholar = sns.barplot(categorical_data['Scholarship Available'].value_counts())
scholar.bar_label(scholar.containers[0])
paid = sns.barplot(categorical_data['Participants Paid'].value_counts())
paid.bar_label(paid.containers[0])
transport = sns.barplot(categorical_data['Transport Provided'].value_counts())
transport.bar_label(transport.containers[0])
food = sns.barplot(categorical_data['Has Free Food'].value_counts())
food.bar_label(food.containers[0])
# Top 5 most frequent categories
category = sns.barplot(categorical_data['Category Name'].value_counts().head(5))
category.bar_label(category.containers[0])
category.tick_params(axis = 'x',labelsize=7)
#top 5 geographic clusters
cluster = sns.barplot(categorical_data['Geographic Cluster Name'].value_counts().head(5))
cluster.bar_label(cluster.containers[0])
cluster.tick_params(axis = 'x',labelsize=5)
meet = sns.barplot(categorical_data['Meeting Type'].value_counts())
meet.bar_label(meet.containers[0])
Distribution of Community Dataset
#For Community Dataset
# Filter to only the relevant columns used in the analysis
relevant_columns = ['COMMUNITY AREA NAME', 'HARDSHIP INDEX']
# NOTE(review): census_data_df is not defined anywhere in this file as shown —
# the census CSV was loaded above as income_df. Confirm which name is intended.
census_data_filtered = census_data_df[relevant_columns]
# Function to summarize categorical variables
def summarize_categorical(data, column):
    """Return a summary dict (type, missing count, cardinality, level counts) for a categorical column."""
    series = data[column]
    counts = series.value_counts()
    n_unique = series.nunique()
    # Show at most the five most frequent levels; all levels when there are few.
    levels = counts.nlargest(5) if n_unique > 5 else counts
    return {
        'Variable': column,
        'Type': 'Categorical',
        'Missing Values': series.isna().sum(),
        'Unique Values': n_unique,
        'Top Levels': levels.to_dict(),
    }
# Function to summarize continuous variables
def summarize_continuous(data, column):
    """Return a summary dict of basic descriptive statistics for a continuous column."""
    series = data[column]
    return {
        'Variable': column,
        'Type': 'Continuous',
        'Missing Values': series.isna().sum(),
        'Mean': series.mean(),
        'Standard Deviation': series.std(),
        'Min': series.min(),
        'Max': series.max(),
        'Median': series.median(),
    }
# Summarize relevant variables
summaries = []
for column in census_data_filtered.columns:
    # Heuristic: object dtype or low cardinality (< 20 distinct values) => categorical.
    if census_data_filtered[column].dtype == 'object' or census_data_filtered[column].nunique() < 20: # Treat as categorical
        summaries.append(summarize_categorical(census_data_filtered, column))
    else: # Treat as continuous
        summaries.append(summarize_continuous(census_data_filtered, column))
# Convert summaries into a DataFrame for tabular display
summary_df = pd.DataFrame(summaries)
print(tabulate(summary_df, headers='keys', tablefmt='grid'))
+----+---------------------+-------------+------------------+-----------------+--------------------------------------------------------------------------------------------+----------+----------------------+-------+-------+----------+
| | Variable | Type | Missing Values | Unique Values | Top Levels | Mean | Standard Deviation | Min | Max | Median |
+====+=====================+=============+==================+=================+============================================================================================+==========+======================+=======+=======+==========+
| 0 | COMMUNITY AREA NAME | Categorical | 0 | 78 | {'Rogers Park': 1, 'Pullman': 1, 'Archer Heights': 1, 'Garfield Ridge': 1, 'Hegewisch': 1} | nan | nan | nan | nan | nan |
+----+---------------------+-------------+------------------+-----------------+--------------------------------------------------------------------------------------------+----------+----------------------+-------+-------+----------+
| 1 | HARDSHIP INDEX | Continuous | 1 | nan | nan | 49.5065 | 28.6906 | 1 | 98 | 50 |
+----+---------------------+-------------+------------------+-----------------+--------------------------------------------------------------------------------------------+----------+----------------------+-------+-------+----------+
Distribution of Temperature Data:
# plot of avg temps for each month
# NOTE(review): month_temps is computed further down in this file (section 1.3)
# — the notebook cells were executed out of file order.
sns.barplot(data=month_temps)
plt.xlabel('Month')
plt.ylabel('Average Temp in Fahrenheit')
Text(0, 0.5, 'Average Temp in Fahrenheit')
Distribution of Bus and Train data:
# Side-by-side bars: filled vs missing location fields for bus stops (left)
# and train stops (right).
fig, axs = plt.subplots(1,2)
sns.barplot(data={'number':[len(bus_data_full['the_geom']) - bus_data_full['the_geom'].isna().sum(), bus_data_full['the_geom'].isna().sum()], 'name':['Filled', 'Missing']}, x='name', y='number', ax = axs[0])
sns.barplot(data={'number':[len(train_data_full['Location']) - train_data_full['Location'].isna().sum(), train_data_full['Location'].isna().sum()], 'name':['Filled', 'Missing']}, x='name', y='number', ax = axs[1])
<Axes: xlabel='name', ylabel='number'>
sns.barplot(data=train_data_full['Line'].value_counts())
<Axes: xlabel='Line', ylabel='count'>
1.2 Data cleaning
By Gina Valdivia and Luna Xu
#...Code with comments...#
# Imputing missing values ## Convert CommAreas to a GeoDataFrame, from geographic boundaries stored in 'the_geom' to use for geospatial operations
comm_areas_gdf = gpd.GeoDataFrame(comm_areas_df, geometry=gpd.GeoSeries.from_wkt(comm_areas_df['the_geom']))
# Filter rows with available Latitude and Longitude in My_CHI
my_chi_filtered = my_chi_df.dropna(subset=['Latitude', 'Longitude']) #removes rows missing Latitude or Longitude
my_chi_filtered['Latitude'] = my_chi_filtered['Latitude'].astype(float)
my_chi_filtered['Longitude'] = my_chi_filtered['Longitude'].astype(float)
my_chi_filtered['geometry'] = my_chi_filtered.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1) #create points from the long and lat
# Convert filtered My_CHI to GeoDataFrame
my_chi_gdf = gpd.GeoDataFrame(my_chi_filtered, geometry='geometry')
# Perform a spatial join to assign community areas to each point(matches each point from my_chi to a polygon)
my_chi_with_cluster = gpd.sjoin(my_chi_gdf, comm_areas_gdf, how='left', op='intersects')
# Group by 'Program Name' and get the most common community name
geographic_cluster_mapping = my_chi_with_cluster.groupby('Program Name')['COMMUNITY'].agg(lambda x: x.mode()[0] if not x.mode().empty else None).to_dict()
# Update 'Geographic Cluster Name' in original dataframe using the computed mapping
my_chi_df['Geographic Cluster Name'] = my_chi_df.apply(
lambda row: geographic_cluster_mapping.get(row['Program Name'], row['Geographic Cluster Name']),
axis=1)# Reading Data
# Fresh copies for Q2's neighborhood standardization pipeline.
project_data = pd.read_csv('My_CHI._My_Future._Programs_20241113.csv')
chi_nei=pd.read_csv('CommAreas_20241114.csv')
# Exclude Online Program: label missing clusters of online programs 'online'
# so they are not treated as missing geography later.
project_data['Geographic Cluster Name'] = project_data.apply(
    lambda row: 'online' if row['Meeting Type'] == 'online' and pd.isnull(row['Geographic Cluster Name']) else row['Geographic Cluster Name'],
    axis=1
)
# Comparing Geographic Cluster Names and neighborhood names in Community Boundaries
project_data_unique = project_data['Geographic Cluster Name'].unique()
chi_nei_unique = chi_nei['COMMUNITY'].unique()
matches = set(project_data_unique).intersection(chi_nei_unique)
unmatched_project_data = set(project_data_unique) - matches
unmatched_chi_nei = set(chi_nei_unique) - matches
print(f"Matches: {matches}")
print(f"Unmatched in project_data: {unmatched_project_data}")
print(f"Unmatched in chi_nei: {unmatched_chi_nei}")
# Extracting Programs with No Geographic Cluster Name or Unstandardized Name
# (only those with coordinates, since they will be geocoded below).
unmatched_geocluster_list = list(unmatched_project_data)
project_withlatlong=project_data.loc[
    ((project_data['Geographic Cluster Name'].isnull()) | (project_data['Geographic Cluster Name'].isin(unmatched_geocluster_list))) &
    (project_data['Latitude'].notnull()) &
    (project_data['Longitude'].notnull()),
    ['Program ID','Latitude','Longitude','Geographic Cluster Name']
]
# Turning data into shapely format & Mapping: Points are (Longitude, Latitude).
project_withlatlong['point_geom']=project_withlatlong.apply(
    lambda row: Point(row['Longitude'],row['Latitude']),axis=1
)
# Parse the WKT boundary text into shapely geometries.
chi_nei['shapely_geom']=chi_nei['the_geom'].apply(wkt.loads)
def match_multiploygon(point, multipolygons):
    """Return the first polygon in `multipolygons` that contains `point`, or None.

    NOTE: the (misspelled) name is kept because callers below reference it.
    """
    return next((polygon for polygon in multipolygons if polygon.contains(point)), None)
# Find, for each program point, the community polygon containing it.
project_withlatlong['shapely_geom']=project_withlatlong['point_geom'].apply(
    lambda point: match_multiploygon(point,chi_nei['shapely_geom'])
)
# Merge on the shared 'shapely_geom' column to recover the COMMUNITY name.
matched_program_neiname = pd.merge(project_withlatlong,chi_nei,how='left')
# Checking Unstandardized Name & the Neighborhood They Mapped to
print(matched_program_neiname.groupby('Geographic Cluster Name')['COMMUNITY'].unique())
# Result: programs that got a community from the spatial match.
attempt1_result = matched_program_neiname.loc[matched_program_neiname['COMMUNITY'].notnull(),['Program ID','COMMUNITY']]
project_data = pd.merge(project_data,attempt1_result,on='Program ID',how='left')
# Prefer the spatially matched community; fall back to the original cluster name.
project_data['Neighborhood'] = project_data.apply(
    lambda row: row['COMMUNITY'] if pd.notnull(row['COMMUNITY']) else row['Geographic Cluster Name'],
    axis=1
)
# Null out names that never matched a real community; keep 'online' as-is.
project_data.loc[project_data['Neighborhood'].isin(unmatched_geocluster_list),'Neighborhood']= None
project_data.loc[project_data['Geographic Cluster Name']=='online',['Neighborhood']]= 'online'
project_data=project_data.drop(columns=['COMMUNITY'])
Matches: {'FOREST GLEN', 'SOUTH DEERING', 'WEST ENGLEWOOD', 'CHATHAM', 'AVONDALE', 'GRAND BOULEVARD', 'WASHINGTON HEIGHTS', 'AVALON PARK', 'RIVERDALE', 'HEGEWISCH', 'HUMBOLDT PARK', 'WEST GARFIELD PARK', 'ARCHER HEIGHTS', 'NORTH CENTER', 'AUBURN GRESHAM', 'UPTOWN', 'SOUTH SHORE', 'EDGEWATER', 'HYDE PARK', 'LOGAN SQUARE', 'BRIGHTON PARK', 'ENGLEWOOD', 'DUNNING', 'ALBANY PARK', 'NORTH PARK', 'WEST RIDGE', 'CLEARING', 'WEST ELSDON', 'FULLER PARK', 'GAGE PARK', 'EAST GARFIELD PARK', 'CHICAGO LAWN', 'MONTCLARE', 'ASHBURN', 'GREATER GRAND CROSSING', 'IRVING PARK', 'NEAR SOUTH SIDE', 'WEST PULLMAN', 'CALUMET HEIGHTS', 'JEFFERSON PARK', 'NEAR WEST SIDE', 'MOUNT GREENWOOD', 'LINCOLN SQUARE', 'NEW CITY', 'ROSELAND', 'EDISON PARK', 'DOUGLAS', 'GARFIELD RIDGE', 'WEST TOWN', 'BRIDGEPORT', 'BURNSIDE', 'MCKINLEY PARK', 'LAKE VIEW', 'WEST LAWN', 'KENWOOD', 'OHARE', 'NORWOOD PARK', 'ROGERS PARK', 'ARMOUR SQUARE', 'PULLMAN', 'WOODLAWN', 'LOWER WEST SIDE', 'HERMOSA', 'BEVERLY', 'EAST SIDE', 'LINCOLN PARK', 'MORGAN PARK', 'AUSTIN', 'NORTH LAWNDALE', 'WASHINGTON PARK', 'LOOP', 'BELMONT CRAGIN', 'OAKLAND', 'SOUTH LAWNDALE', 'PORTAGE PARK', 'NEAR NORTH SIDE', 'SOUTH CHICAGO'}
Unmatched in project_data: {'North/Central Equity Zone', 'Back of the Yards', 'Greater Englewood', 'Bronzeville/South Lakefront', 'Garfield Park', 'Near South Equity Zone', 'Greater Roseland', 'Englewood Block Group 1', 'Northwest Equity Zone', 'online', 'Southwest Equity Zone', 'Little Village', nan, 'Far South Equity Zone', 'West Equity Zone'}
Unmatched in chi_nei: set()
Geographic Cluster Name
Back of the Yards [NEW CITY, NEAR WEST SIDE, NEAR SOUTH SIDE, WE...
Bronzeville/South Lakefront [KENWOOD, HYDE PARK, WOODLAWN, WASHINGTON PARK...
Englewood Block Group 1 [ENGLEWOOD]
Far South Equity Zone [ROSELAND, MOUNT GREENWOOD, WEST PULLMAN, HEGE...
Garfield Park [WEST GARFIELD PARK, EAST GARFIELD PARK, LOOP,...
Greater Englewood [WEST ENGLEWOOD, ENGLEWOOD, CHICAGO LAWN]
Greater Roseland [WEST PULLMAN, ROSELAND, PULLMAN, NEAR SOUTH S...
Little Village [SOUTH LAWNDALE, NORTH LAWNDALE, IRVING PARK, ...
Near South Equity Zone [DOUGLAS, SOUTH SHORE, OAKLAND, FULLER PARK, K...
North/Central Equity Zone [LOOP, LINCOLN SQUARE, EDGEWATER, NEAR NORTH S...
Northwest Equity Zone [IRVING PARK, BELMONT CRAGIN, NORWOOD PARK, LO...
Southwest Equity Zone [BRIGHTON PARK, GAGE PARK, GARFIELD RIDGE, ASH...
West Equity Zone [HUMBOLDT PARK, AUSTIN, SOUTH LAWNDALE, NEAR W...
online [WEST PULLMAN, LINCOLN PARK, AVONDALE, EDGEWAT...
Name: COMMUNITY, dtype: object
# Creating useable df for bus data (extracting Latitude and Longitude, only keeping lat, long, and stop name)
# Assumes the_geom is WKT like "POINT (lon lat)" — slice(6) drops "POINT ",
# the replaces strip parentheses, leaving "lon lat" to split on. TODO confirm format.
bus_lat_long = bus_data_full['the_geom'].str.slice(6).str.replace('(','').str.replace(')','').str.split(' ')
bus_data_full['Longitude'] = pd.to_numeric(bus_lat_long.apply(lambda x: x[0]))
bus_data_full['Latitude'] = pd.to_numeric(bus_lat_long.apply(lambda x: x[1]))
bus_data = bus_data_full.loc[:, ['PUBLIC_NAM', 'Longitude', 'Latitude']]
# getting lat, long, train line, and station name
# Location is "(lat, lon)" text, so index 0 is latitude and index 1 longitude.
train_lat_long = train_data_full['Location'].str.replace('(','').str.replace(')','').str.split(',')
train_data_full['Longitude'] = pd.to_numeric(train_lat_long.apply(lambda x: x[1]))
train_data_full['Latitude'] = pd.to_numeric(train_lat_long.apply(lambda x: x[0]))
# Collapse the per-line boolean flag columns into a single 'Line' label;
# anything not matching the first seven colors falls through to 'Orange'.
train_data_full['Line'] = train_data_full.apply(lambda x: 'Red' if x['RED'] == True else 'Blue' if x['BLUE'] == True else 'Green' if x['G'] == True
    else 'Brown' if x['BRN'] == True else 'Purple' if x['P'] == True else 'Yellow' if x['Y'] == True
    else 'Pink' if x['Pnk'] == True else 'Orange', axis = 1)
train_data = train_data_full.loc[:, ['STATION_DESCRIPTIVE_NAME', 'Longitude', 'Latitude', 'Line']]
# cleaning Participants Paid to True or False for better graphing, only keeping used columns (and program ID for identification while coding)
project_data = project_data_full.loc[:,['Program ID', 'Meeting Type', 'Scholarship Available', 'Participants Paid', 'Transport Provided', 'Has Free Food', 'Latitude', 'Longitude']]
project_data['Participants Paid'] = project_data['Participants Paid'].apply(lambda x: True if x == 'Paid, Type Unknown' else False if x == 'Not Paid' else np.nan)
# making subdataset to use for face-to-face data (must have location data)
# NOTE(review): the mask comes from project_data_full but indexes project_data —
# this works only because both share the same row index.
project_data_f2f = project_data[~project_data_full['Location'].isna()]
project_data_f2f = project_data_f2f[project_data_f2f['Meeting Type'] == 'face_to_face']
project_data_f2f.drop_duplicates(subset='Program ID', inplace=True)
1.3 Data preparation
By all members
For exploration 1, the dataset does not need to be further prepared past imputation.
For exploration 2:
# #Creating three socioeconomic status bins
chi_ses = pd.read_csv('Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012.csv')
# Equal-width thirds of HARDSHIP INDEX; low hardship maps to 'High-SES'.
chi_ses['ses_bin'] = pd.cut(chi_ses['HARDSHIP INDEX'],3,labels=['High-SES','Mid-SES','Low-SES'])
chi_ses=chi_ses[chi_ses['COMMUNITY AREA NAME']!='CHICAGO']#drop chicago total measure
#Creating Start Year Variable from Start Date
project_data['Start Date']=pd.to_datetime(project_data['Start Date'],format='%m/%d/%Y')
project_data['Start Year']=project_data['Start Date'].dt.year
For exploration 3:
#average temp of each month based on avg monthly temp between (2000-2023)
#i left out 2024 because it is incomplete (we are still in 2024)
temp_data = temp_data.iloc[:-1, :]
# Coerce all cells numeric; anything non-numeric becomes NaN and is skipped by mean().
temp_data = temp_data.apply(pd.to_numeric, errors='coerce')
# Column-wise mean over the years for the 12 month columns.
# NOTE(review): assumes column 0 is the year and the last column an annual
# aggregate (both excluded by iloc[:, 1:-1]) — confirm against the CSV.
month_temps = temp_data.iloc[:, 1:-1].mean(axis=0)
print(month_temps)
Jan 24.975000
Feb 27.150000
Mar 38.858333
Apr 49.408333
May 60.175000
Jun 70.266667
Jul 74.862500
Aug 73.645833
Sep 66.558333
Oct 53.783333
Nov 41.237500
Dec 30.029167
dtype: float64
# changing start date and end date to datetime format and extract their months as new columns (assignment D)
start_date = pd.to_datetime(mcmf_project_data['Start Date'])
end_date = pd.to_datetime(mcmf_project_data['End Date'])
mcmf_project_data['start_month'] = start_date.dt.month
mcmf_project_data['end_month'] = end_date.dt.month
# turn max-min age into a range and start_month and end_month into range
# then exploding the column so that each age that participates in the program will be accounted for
mcmf_project_data= mcmf_project_data[mcmf_project_data['Min Age'] <= 25]
# NOTE(review): range() needs integer endpoints — raises on float/NaN ages or months; confirm dtypes.
# NOTE(review): if end_month < start_month (program spans a year boundary), month_range is empty.
mcmf_project_data['age_range'] = mcmf_project_data.apply(lambda row: list(range(row['Min Age'], row['Max Age'] + 1)), axis=1)
mcmf_project_data['month_range'] = mcmf_project_data.apply(lambda row: list(range(row['start_month'], row['end_month'] + 1)), axis=1)
# One row per (program, month, age) combination.
project_data_exploded = mcmf_project_data.explode('month_range').explode('age_range')
#binning ages in age groups
# right=False => half-open bins [lo, hi): '0-5' actually covers ages min..4.
age_bins = [mcmf_project_data['Min Age'].min(), 5, 10, 14, 18, 25]
labels = ['0-5', '6-10', '11-14', '15-18', '19-25']
project_data_exploded['age_range_binned'] = pd.cut(project_data_exploded['age_range'], bins=age_bins, labels=labels, right=False)
#binning months into seasons
# 'Winter' appears twice (Dec/Jan wrap-around), hence ordered=False below.
month_bins = [0, 2, 5, 8, 11, 13]
season_labels = ['Winter', 'Spring', 'Summer', 'Autumn', 'Winter']
project_data_exploded['Seasons'] = pd.cut(project_data_exploded['month_range'], bins=month_bins, labels=season_labels, right=False, ordered=False)
# mapping temp data to MCMF data: month number -> name -> average temperature.
month_num_to_name = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
project_data_exploded['month_name'] = project_data_exploded['month_range'].map(month_num_to_name)
project_data_exploded['avg_temperature'] = project_data_exploded['month_name'].map(month_temps)
# NOTE(review): '25+' is not one of the labels defined above, so this filter is a no-op — confirm intent.
project_data_considered = project_data_exploded[project_data_exploded['age_range_binned'] != '25+']
project_data_considered = project_data_considered.reset_index()
#binning temperatures
temp_bins = [0, 32, 45, 55, 65, project_data_considered['avg_temperature'].max()]
temp_labels = ['Below Freezing', '33-45° F', '46-55° F', '56-65° F', '66+° F']
project_data_considered['temp_binned'] = pd.cut(project_data_considered['avg_temperature'], bins=temp_bins, labels=temp_labels, right=False)
For exploration 4:
radius = 3957.297 # estimation of the Earth's radius at Chicago's Latitude in miles
radius *= 5280 # radius estimate converted to feet
bus_long = np.deg2rad(bus_data['Longitude']) # preparing bus stop data for haversine
bus_lat = np.deg2rad(bus_data['Latitude'])
def dist_to_closest_bus_station(lat, long):
    """Return the haversine distance in feet from (lat, long) to the nearest bus stop.

    lat/long are in degrees; uses the module-level bus_long/bus_lat radian Series.
    """
    program_long = np.deg2rad(long)
    program_lat = np.deg2rad(lat)
    diff_long = bus_long - program_long
    diff_lat = bus_lat- program_lat
    # Haversine: d is sin^2 of half the central angle; h the arc length.
    add = np.cos(program_lat) * np.cos(bus_lat) * np.sin(diff_long * 0.5) ** 2
    d = np.sin(diff_lat * 0.5) ** 2 + add
    h = 2 * radius * np.arcsin(np.sqrt(d))
    return(h.min())
train_long = np.deg2rad(train_data['Longitude']) # prepares train stop data
train_lat = np.deg2rad(train_data['Latitude'])
def dist_to_closest_train_station(lat, long):
    """Return a Series [distance in feet to nearest train station, that station's Line]."""
    program_long = np.deg2rad(long)
    program_lat = np.deg2rad(lat)
    diff_long = train_long - program_long
    diff_lat = train_lat- program_lat
    add = np.cos(program_lat) * np.cos(train_lat) * np.sin(diff_long * 0.5) ** 2
    d = np.sin(diff_lat * 0.5) ** 2 + add
    h = 2 * radius * np.arcsin(np.sqrt(d))
    # Index of the closest station, used to look up its line color.
    idx = h.idxmin()
    return(pd.Series([h.min(), train_data['Line'][idx]]))
project_data_f2f['Distance to Closest Bus Station (ft)'] = project_data_f2f.apply(lambda row: dist_to_closest_bus_station(row.Latitude, row.Longitude), axis = 1)
project_data_f2f[['Distance to Closest Train Station (ft)', 'Closest Train Line']] = project_data_f2f.apply(lambda row: dist_to_closest_train_station(row.Latitude, row.Longitude), axis = 1)
project_data_f2f.sort_values('Distance to Closest Bus Station (ft)', ascending=False).head(8)
| Program ID | Meeting Type | Scholarship Available | Participants Paid | Transport Provided | Has Free Food | Latitude | Longitude | Distance to Closest Bus Station (ft) | Distance to Closest Train Station (ft) | Closest Train Line | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 372622 | 94624 | face_to_face | False | False | False | False | 40.015900 | -120.961998 | 9.085801e+06 | 9.070352e+06 | Blue |
| 370121 | 94583 | face_to_face | False | False | False | False | 40.015900 | -120.961998 | 9.085801e+06 | 9.070352e+06 | Blue |
| 344316 | 175001 | face_to_face | False | NaN | NaN | NaN | 38.922466 | -94.731972 | 2.163517e+06 | 2.197046e+06 | Blue |
| 373432 | 121052 | face_to_face | True | False | False | True | 39.482399 | -88.172798 | 8.069517e+05 | 8.308609e+05 | Red |
| 370593 | 137468 | face_to_face | False | NaN | NaN | NaN | 41.758400 | -88.317200 | 1.178404e+05 | 1.377383e+05 | Blue |
| 370932 | 144406 | face_to_face | False | NaN | NaN | NaN | 42.035400 | -88.230698 | 1.062665e+05 | 9.093925e+04 | Blue |
| 91751 | 202814 | face_to_face | False | True | NaN | False | 42.126797 | -88.076065 | 7.858434e+04 | 7.157509e+04 | Blue |
| 370085 | 135987 | face_to_face | False | NaN | NaN | NaN | 41.866299 | -88.105202 | 7.391032e+04 | 6.799511e+04 | Blue |
The first three places are not in Illinois and therefore make sense to remove. The fourth greatest distance to closest bus station entry is in Charleston IL (south of Springfield) and also makes sense to remove. The Fifth is in Aurora, which I think is close enough to justify its inclusion (which also means every subsequent program is closer and more reasonable to include)
# Sort descending by bus-stop distance so the furthest (out-of-Chicago)
# programs come first, then drop those top four rows.
project_data_f2f = project_data_f2f.sort_values('Distance to Closest Bus Station (ft)', ascending=False)
project_data_f2f = project_data_f2f.copy().iloc[4:]
# removing the furthest four programs as their coordinates put them outside a reasonable bound of Chicago
2 Exploratory data analysis
2.1 Analysis 1
By Gina Valdivia
/opt/anaconda3/lib/python3.11/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
grouped_vals = vals.groupby(grouper)
# Plotting Hardship Index vs Program Count with a line of best fit including confidence interval
# NOTE(review): density_income_df is built in a cell not captured in this file view.
plt.figure(figsize=(10, 6))
sns.regplot(
    data=density_income_df,
    x='HARDSHIP INDEX',
    y='Hardship Program Count',
    ci=95, # Confidence interval level, default is 95%
    scatter_kws={'color': 'skyblue', 'alpha': 0.7},
    line_kws={'color': 'red', 'linestyle': '--'}
)
plt.xlabel('Hardship Index')
plt.ylabel('Program Count')
plt.title('Program Count vs Hardship Index with Line of Best Fit and Confidence Interval')
plt.grid(True)
plt.tight_layout()
plt.show()
# Programs per category within each SES bin.
# NOTE(review): category_ses_counts is computed in a cell not captured here.
plt.figure(figsize=(14, 8))
sns.barplot(
    data=category_ses_counts,
    x='ses_bin',
    y='Program Count',
    hue='Category Name',
    palette=sns.color_palette('tab20', n_colors=len(category_ses_counts['Category Name'].unique()))
)
plt.xlabel('Socioeconomic Status (SES) Bin')
plt.ylabel('Number of Programs')
plt.title('Number of Programs by Category and Socioeconomic Status')
plt.xticks(rotation=45)
plt.legend(title='Program Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='y')
plt.tight_layout()
plt.show()
/opt/anaconda3/lib/python3.11/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
grouped_vals = vals.groupby(grouper)
/opt/anaconda3/lib/python3.11/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
grouped_vals = vals.groupby(grouper)
# Plotting the number of programs by price category and SES level
# NOTE(review): price_ses_counts is computed in a cell not captured here.
plt.figure(figsize=(14, 8))
sns.barplot(
    data=price_ses_counts,
    x='ses_bin',
    y='Count',
    hue='Program Price',
    palette='Set2'
)
plt.xlabel('Socioeconomic Status (SES) Bin')
plt.ylabel('Number of Programs')
plt.title('Number of Programs by Price Category and Socioeconomic Status')
plt.xticks(rotation=45)
plt.legend(title='Program Price', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='y')
plt.tight_layout()
plt.show()
/opt/anaconda3/lib/python3.11/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
grouped_vals = vals.groupby(grouper)
/opt/anaconda3/lib/python3.11/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
grouped_vals = vals.groupby(grouper)
# Row of program_density with the highest program count.
max_programs = program_density.loc[program_density['Hardship Program Count'].idxmax()]
print("Neighborhood with the highest number of programs:")
print(max_programs)
Neighborhood with the highest number of programs:
Geographic Cluster Name IRVING PARK
Hardship Program Count 28641
Name: 34, dtype: object
# Plotting the number of programs with scholarships available by SES level
# NOTE(review): scholarship_ses_counts is computed in a cell not captured here.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=scholarship_ses_counts,
    x='ses_bin',
    y='Scholarship Program Count',
    palette='Set2'
)
plt.xlabel('Socioeconomic Status (SES) Bin')
plt.ylabel('Number of Programs with Scholarships Available')
plt.title('Number of Programs with Scholarships by Socioeconomic Status')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
/opt/anaconda3/lib/python3.11/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
grouped_vals = vals.groupby(grouper)
# Plotting the number of programs with participants paid by SES level
# NOTE(review): participants_paid_ses_counts is computed in a cell not captured here.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=participants_paid_ses_counts,
    x='ses_bin',
    y='Participants Paid Count',
    palette='Set2'
)
plt.xlabel('Socioeconomic Status (SES) Bin')
plt.ylabel('Number of Programs with Participants Paid')
plt.title('Number of Programs with Participants Paid by Socioeconomic Status')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
/opt/anaconda3/lib/python3.11/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
grouped_vals = vals.groupby(grouper)
# Plotting the number of programs with transport provided by SES level
# NOTE(review): transport_ses_counts is computed in a cell not captured here.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=transport_ses_counts,
    x='ses_bin',
    y='Transport Provided Count',
    palette='Set2'
)
plt.xlabel('Socioeconomic Status (SES) Bin')
plt.ylabel('Number of Programs with Transport Provided')
plt.title('Number of Programs with Transport Provided by Socioeconomic Status')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
/opt/anaconda3/lib/python3.11/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
grouped_vals = vals.groupby(grouper)
# Bar chart: number of programs offering free food, per SES bin.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=free_food_ses_counts,
    x='ses_bin',
    y='Free Food Count',
    palette='Set2',
    ax=ax,
)
ax.set_xlabel('Socioeconomic Status (SES) Bin')
ax.set_ylabel('Number of Programs with Free Food')
ax.set_title('Number of Programs with Free Food by Socioeconomic Status')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y')
fig.tight_layout()
plt.show()
2.2 Analysis 2
By Luna Xu
# merge census dataset with MCMF dataset
# NOTE(review): this merge happens BEFORE chi_ses['COMMUNITY AREA NAME'] is
# upper-cased below, so it matches the census file's original casing — confirm
# that is intentional (MCMF 'Neighborhood' presumably uses the same casing).
project_ses=pd.merge(project_data,chi_ses,left_on='Neighborhood',right_on='COMMUNITY AREA NAME',how='left')
# filter MCMF in-person programs with neighborhood info
# (drops rows with missing Neighborhood and the 'online' placeholder value)
project_hasnei_ses=project_ses.loc[project_ses['Neighborhood'].notnull() & (project_ses['Neighborhood']!='online')]
# merge census dataset with neighborhood boundary dataset
# upper-case the census area names so they match the all-caps 'COMMUNITY'
# column of the boundary file; this mutates chi_ses in place.
chi_ses['COMMUNITY AREA NAME']=chi_ses['COMMUNITY AREA NAME'].astype(str).str.upper()
chi_nei_ses=pd.merge(chi_nei,chi_ses,left_on='COMMUNITY',right_on='COMMUNITY AREA NAME')
# create geopandas dataset using the parsed 'shapely_geom' geometry column
chi_nei_ses_gdf=gpd.GeoDataFrame(chi_nei_ses,geometry='shapely_geom')
# count number of distinct programs by neighborhood
program_counts = project_hasnei_ses.groupby('Neighborhood')['Program ID'].nunique().reset_index()
# copy the nunique result into a descriptive 'Program Count' column
program_counts['Program Count']=program_counts['Program ID']
# defensive: nunique() itself cannot produce NaN, so this fillna is a no-op safeguard
program_counts['Program Count']=program_counts['Program Count'].fillna(0)
# merge program count with neighborhood geographic & ses information
program_count_nei=pd.merge(program_counts,chi_nei_ses_gdf,left_on='Neighborhood',right_on='COMMUNITY')
# create geopandas dataset of the previous dataset
program_count_nei_gdf=gpd.GeoDataFrame(program_count_nei,geometry='shapely_geom')
# slice dataset by ses
lowses=program_count_nei_gdf.loc[program_count_nei_gdf['ses_bin']=='Low-SES',:]
midses=program_count_nei_gdf.loc[program_count_nei_gdf['ses_bin']=='Mid-SES',:]
highses=program_count_nei_gdf.loc[program_count_nei_gdf['ses_bin']=='High-SES',:]
# geospatial plotting: one choropleth panel per SES tier.
# The three panels were near-identical copy-paste; collapsed into a loop with a
# single shared LogNorm so panel shading stays directly comparable.
fig, axes = plt.subplots(1, 3, figsize=(18, 6), constrained_layout=True)
# Log color scale handles the wide spread of program counts across neighborhoods;
# vmin/vmax come from the FULL dataset so all panels share one scale.
shared_norm = LogNorm(vmin=program_count_nei_gdf['Program Count'].min(),
                      vmax=program_count_nei_gdf['Program Count'].max())
panels = [
    (lowses, 'Low-SES Neighborhood', axes[0]),
    (midses, 'Mid-SES Neighborhood', axes[1]),
    (highses, 'High-SES Neighborhood', axes[2]),
]
for subset, panel_title, ax in panels:
    subset.plot(ax=ax, column='Program Count', cmap='YlGnBu',
                norm=shared_norm, alpha=0.7)
    # Label each neighborhood polygon with its program count at the centroid.
    for _, row in subset.iterrows():
        centroid = row['shapely_geom'].centroid
        ax.annotate(text=int(row['Program Count']),
                    xy=(centroid.x, centroid.y),
                    fontsize=7, ha='center', color='black', fontweight='bold')
    ax.set_title(panel_title)
fig.suptitle('Program Distribution Based On Neighborhood SES', fontsize=16)
plt.show()
# merge MCMF dataset with neighborhood ses
project_by_year_by_ses=pd.merge(project_data,chi_ses,left_on="Neighborhood",right_on="COMMUNITY AREA NAME")
# exclude 2025 data (partial year would distort the trend line)
project_by_year_by_ses=project_by_year_by_ses[project_by_year_by_ses['Start Year']!=2025]
# count distinct programs based on ses & year.
# observed=False is the current pandas default for categorical group keys; passing
# it explicitly preserves behavior and silences the FutureWarning seen in output.
year_nei_programcount=(
    project_by_year_by_ses
    .groupby(['Start Year','ses_bin'], observed=False)['Program ID']
    .nunique()
    .reset_index()
    .rename(columns={'Program ID':'Program Count'})
)
pivotdf=year_nei_programcount.pivot(index='Start Year',columns='ses_bin',values='Program Count')
# create lineplot of program count by neighborhood ses over time
plt.figure(figsize=(10,5))
for ses in pivotdf.columns:
    plt.plot(pivotdf.index,pivotdf[ses], label=ses)
    for year,count in zip(pivotdf.index,pivotdf[ses]):
        if year == 2020:
            # The series converge at 2020, so each one gets a different vertical
            # offset to keep its label readable.
            offset = 500 * (pivotdf.columns.tolist().index(ses) + 1)
            plt.text(year, count + offset, str(count), fontsize=8, ha='center', va='top')
        else:
            plt.text(year, count, str(count), fontsize=8, ha='center', va='bottom')
plt.title("Program Count By Neighborhood SES Over Time")
plt.xlabel("Year")
# axis label grammar fixed ("Total Number Program" -> "Total Number of Programs")
plt.ylabel("Total Number of Programs")
plt.xticks(ticks=pivotdf.index)
plt.legend(title="Neighborhood SES")
plt.show()
# count distinct programs by neighborhood and year
pivot_distro_data = (
    project_by_year_by_ses
    .groupby(['Start Year', 'Neighborhood'])['Program ID']
    .nunique()
    .reset_index()
    .rename(columns={'Program ID':'Program Count'})
)
# attach each (year, neighborhood)'s SES bin for the hue grouping below
pivot_distro_data = pivot_distro_data.merge(
    project_by_year_by_ses[['Start Year', 'Neighborhood', 'ses_bin']].drop_duplicates(),
    on=['Start Year', 'Neighborhood'],
    how='left'
)
# create side-by-side boxplot (title typo fixed: "Comparsion" -> "Comparison")
sns.boxplot(pivot_distro_data, x='Start Year', y='Program Count', hue='ses_bin') \
    .set_title('A Comparison of the Distribution of Programs among Neighborhoods By SES')
plt.show()
# show the top 10 neighborhoods with top program count
# pivot_distro_data.sort_values(by='Program Count',ascending=False).head(10)
# helper to render one SES-by-year heatmap onto a given axes
def generate_heatmap(data, title, label, col_label, row_label, ax):
    """Draw an annotated heatmap of 'Offers' (ses_bin rows x Start Year columns).

    Missing (ses_bin, year) combinations are shown as 0; `label` names the
    colorbar, `col_label`/`row_label` name the axes.
    """
    grid = data.pivot(index='ses_bin', columns='Start Year', values='Offers').fillna(0)
    sns.heatmap(grid, annot=True, fmt=".0f", cmap="YlOrBr",
                cbar_kws={'label': label}, ax=ax)
    ax.set(title=title, xlabel=col_label, ylabel=row_label)
fig, axes = plt.subplots(1,3, figsize=(14.5, 3.5))
# Scholarship Heatmap.
# observed=False is passed explicitly on every groupby over the categorical
# ses_bin: it is the current pandas default, so behavior is unchanged, and the
# FutureWarning seen in the output is silenced.
scholarship_offers_by_neises = (
    project_by_year_by_ses[project_by_year_by_ses['Scholarship Available'] == True]
    .groupby(['ses_bin', 'Start Year'], observed=False)
    .agg(Offers=('Program ID', 'nunique'))
    .reset_index()
)
# row label fixed: rows are SES bins, not neighborhoods
generate_heatmap(scholarship_offers_by_neises, "Scholarship Programs by Neighborhood-SES and Year",
                 "Number of Scholarships Programs", "Year", "Neighborhood SES", axes[0])
# Free Food Heatmap
freefood_by_neises = (
    project_by_year_by_ses[project_by_year_by_ses['Has Free Food'] == True]
    .groupby(['ses_bin', 'Start Year'], observed=False)
    .agg(Offers=('Program ID', 'nunique'))
    .reset_index()
)
generate_heatmap(freefood_by_neises, "Free Food Programs by Neighborhood-SES and Year",
                 "Number of Free Food Programs", "Year", "Neighborhood SES", axes[1])
# Paid Programs Heatmap
paid_offers_by_neises = (
    project_by_year_by_ses[project_by_year_by_ses['Participants Paid'] == 'Paid, Type Unknown']
    .groupby(['ses_bin', 'Start Year'], observed=False)
    .agg(Offers=('Program ID', 'nunique'))
    .reset_index()
)
# No paid programs exist in 2020, so inject explicit zero rows to keep a 2020
# column in the heatmap.
fillna2020 = pd.DataFrame({
    'ses_bin': ['High-SES', 'Mid-SES', 'Low-SES'],
    'Start Year': [2020, 2020, 2020],
    'Offers': [0, 0, 0]
})
paid_offers_by_neises = pd.concat([paid_offers_by_neises, fillna2020], ignore_index=True)
# order rows High -> Mid -> Low to match the other two heatmaps
ses_order = ['High-SES', 'Mid-SES', 'Low-SES']
paid_offers_by_neises['ses_bin'] = pd.Categorical(paid_offers_by_neises['ses_bin'], categories=ses_order, ordered=True)
paid_offers_by_neises = paid_offers_by_neises.sort_values(by=['ses_bin', 'Start Year']).reset_index(drop=True)
generate_heatmap(paid_offers_by_neises, "Paid Programs by Neighborhood-SES and Year",
                 "Number of Paid Programs", "Year", "Neighborhood SES", axes[2])
plt.tight_layout()
plt.show()
# group program categories
project_by_year_by_ses['Category Group']=project_by_year_by_ses['Category Name'].map(
{
'Work + Career':'Career & Life Skills',
'Managing Money.':'Career & Life Skills',
'Reading & Writing.':'STEM & Writing',
'Music & Art.':'Arts & Humanity',
'Sports + Wellness.':'Sports & Wellbeing',
'Science':'STEM & Writing',
'Food.':'Sports & Wellbeing',
'Academic Support':'STEM & Writing',
'Digital Media.':'Arts & Humanity',
'Performance.':'Arts & Humanity',
'Healthcare':'Sports & Wellbeing',
'Social Studies':'Arts & Humanity',
'Computers.':'STEM & Writing',
'Math':'STEM & Writing',
'Helping Your Community.':'Arts & Humanity',
'Building & Fixing Things':'Career & Life Skills',
'Nature.':'Sports & Wellbeing',
'Teaching':'Career & Life Skills',
'Customer/Human Service':'Career & Life Skills',
'Transportation':'Career & Life Skills',
'Law':'Career & Life Skills'
}
)
# count distinct scholarship programs per (SES bin, category group)
scholarship_rows = project_by_year_by_ses.loc[
    project_by_year_by_ses['Scholarship Available'] == True,
    ['Program ID', 'Category Group', 'ses_bin']
]
ses_category_programcount = (
    scholarship_rows
    .groupby(['ses_bin', 'Category Group'])
    .agg(NumofProgram=('Program ID', 'nunique'))
    .reset_index()
    .sort_values(by=['ses_bin', 'NumofProgram'], ascending=[True, False])
)
# side-by-side barplot of scholarship program category distribution by SES
sns.barplot(ses_category_programcount, x='ses_bin', y='NumofProgram', hue='Category Group') \
    .set_title('Scholarship Program Category Distribution by Neighborhood SES')
plt.legend(title='Program Category Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
# count free-food programs by neighborhood
freefood_by_nei = (
    project_by_year_by_ses[project_by_year_by_ses['Has Free Food'] == True]
    .groupby(['Neighborhood'])
    .agg(Offers=('Program ID', 'nunique'))
    .reset_index()
)
# attach each neighborhood's hardship index and SES bin
hardship_lookup = project_by_year_by_ses[['HARDSHIP INDEX', 'ses_bin', 'Neighborhood']].drop_duplicates()
freefood_by_nei = freefood_by_nei.merge(hardship_lookup, on='Neighborhood')
# scatter with fitted trendline
sns.regplot(freefood_by_nei, x='HARDSHIP INDEX', y='Offers')
plt.xlabel('Neighborhood Hardship Index')
plt.ylabel('Number of Free Food Programs')
plt.title('The Relationship Between Neighborhood Hardship Level and Number of Free Food Programs Offered')
plt.show()
# find the top and bottom neighborhoods by free-food program count
# freefood_by_nei.sort_values(by='Offers',ascending=False).head(10)
# freefood_by_nei.sort_values(by='Offers',ascending=False).tail(10)
# get all paid programs (rows labeled 'Paid, Type Unknown'), keeping only the
# columns needed for the description text analysis below
paid_programs=project_by_year_by_ses[project_by_year_by_ses['Participants Paid'] == 'Paid, Type Unknown'].loc[:,['Program ID','Description','Org Name']]
# the merged frame repeats programs across rows; collapse to one row per Program ID
paid_programs=paid_programs.set_index('Program ID').drop_duplicates()
# download relevant nltk packages (no-ops when already installed)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
# English stopword set used by description_cleaner below
stop_words = set(stopwords.words('english'))
# create a function that cleans and slices descriptions into words
def description_cleaner(description):
    """Strip HTML tags from *description*, lowercase and tokenize it, and
    return only the alphabetic tokens that are not English stopwords."""
    # remove HTML tags (non-greedy so adjacent tags are handled separately)
    cleaned_description = re.sub(r'<.*?>', '', description)
    words = word_tokenize(cleaned_description.lower())
    return [word for word in words if word not in stop_words and word.isalpha()]
# tokenize every paid-program description into a clean word list
paid_programs['Cleaned_Description'] = paid_programs['Description'].apply(description_cleaner)
# flatten all descriptions into one token stream and count word frequencies
word_counts = Counter(
    word
    for description in paid_programs['Cleaned_Description']
    for word in description
)
# Get the 20 most common words
common_words = word_counts.most_common(20)
# bar graph of word frequencies
words, frequencies = zip(*common_words)
plt.figure(figsize=(6, 4))
plt.bar(words, frequencies, color='skyblue')
plt.title('Word Frequencies in Paid Program Description', fontsize=16)
plt.xlabel('Words', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.tight_layout()
plt.show()
# helper that slices a description into sentences and keeps the matching ones
def sentence_seperator(description, word):
    """Return the sentences of *description* (HTML tags removed) that contain
    *word*, matched case-insensitively. Sentence boundaries are the characters
    '.', '!', '?' (kept at the end of each sentence)."""
    text = re.sub(r'<.*?>', '', description)
    target = word.lower()
    return [sentence for sentence in re.split(r'(?<=[.!?])', text)
            if target in sentence.lower()]
# find the sentences in descriptions that contain the word "paid"
paidword_program = project_by_year_by_ses[
    project_by_year_by_ses['Description'].str.contains('paid', case=False, na=False)
]
paidword_program = paidword_program.loc[:, ['Program ID', 'Program Name', 'Description',
                                            'COMMUNITY AREA NAME', 'Scholarship Available',
                                            'Participants Paid']]
paidword_program = paidword_program.set_index('Program ID').drop_duplicates()
# programs mentioning "paid" that are NOT labeled as paid or scholarship programs.
# .copy() makes this an independent frame so the column assignment below is a
# plain write instead of a chained assignment on a view (fixes the
# SettingWithCopyWarning risk of the original .loc-on-slice assignment).
not_labeled_paidword_program = paidword_program[
    (paidword_program['Participants Paid'] != 'Paid, Type Unknown')
    & (paidword_program['Scholarship Available'] == False)
].copy()
not_labeled_paidword_program['Matching Sentences'] = not_labeled_paidword_program['Description'].apply(
    lambda desc: sentence_seperator(desc, 'paid')
)
# display five samples of sentences containing "paid" to inspect whether they
# actually reflect that the program pays participants
pd.set_option('display.max_colwidth', None)
not_labeled_paidword_program.loc[:, ['Matching Sentences']].sample(5)
# find the sentences in descriptions that contain the word "stipend"
stipend_program = project_by_year_by_ses[
    project_by_year_by_ses['Description'].str.contains('stipend', case=False, na=False)
]
stipend_program = stipend_program.loc[:, ['Program ID', 'Program Name', 'Description',
                                          'COMMUNITY AREA NAME', 'Scholarship Available',
                                          'Participants Paid']]
stipend_program = stipend_program.set_index('Program ID').drop_duplicates()
# .copy() gives an independent frame so the column assignment below is a plain
# write, not a chained assignment on a view (avoids SettingWithCopyWarning).
not_labeled_stipend_program = stipend_program[
    (stipend_program['Participants Paid'] != 'Paid, Type Unknown')
    & (stipend_program['Scholarship Available'] == False)
].copy()
not_labeled_stipend_program['Matching Sentences'] = not_labeled_stipend_program['Description'].apply(
    lambda desc: sentence_seperator(desc, 'stipend')
)
# display five samples of sentences containing "stipend" to inspect whether they
# actually reflect that the program pays participants
not_labeled_stipend_program.loc[:, ['Matching Sentences']].sample(5)
# flag programs whose description mentions a stipend but were not labeled paid
stipend_ids = not_labeled_stipend_program.index
project_by_year_by_ses.loc[
    project_by_year_by_ses['Program ID'].isin(stipend_ids),
    'Stipend'
] = True
# count distinct paid-or-stipend programs per (SES bin, start year)
is_paid_or_stipend = (
    (project_by_year_by_ses['Participants Paid'] == 'Paid, Type Unknown')
    | (project_by_year_by_ses['Stipend'] == True)
)
new_paid_offers_by_neises = (
    project_by_year_by_ses[is_paid_or_stipend]
    .groupby(['ses_bin', 'Start Year'])
    .agg(Offers=('Program ID', 'nunique'))
    .reset_index()
)
# reuse the 2020 zero-fill rows and SES ordering set up for the earlier heatmap
new_paid_offers_by_neises = pd.concat([new_paid_offers_by_neises, fillna2020], ignore_index=True)
new_paid_offers_by_neises['ses_bin'] = pd.Categorical(new_paid_offers_by_neises['ses_bin'],
                                                      categories=ses_order, ordered=True)
new_paid_offers_by_neises = (new_paid_offers_by_neises
                             .sort_values(by=['ses_bin', 'Start Year'])
                             .reset_index(drop=True))
# side-by-side heatmaps: before vs after imputing stipend programs
fig, axes = plt.subplots(1, 2, figsize=(12, 3))
generate_heatmap(paid_offers_by_neises, "Paid Programs by Neighborhood-SES and Year",
                 "Number of Paid Programs", "Year", "Neighborhood SES", axes[0])
generate_heatmap(new_paid_offers_by_neises, "Stipend & Paid Programs by Neighborhood-SES and Year",
                 "Number of Paid Programs", "Year", "Neighborhood SES", axes[1])
plt.tight_layout()
plt.show()
2.3 Analysis 3
By Isabella Seo
# program distribution by month
sns.set_style("whitegrid")
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
sns.countplot(data=project_data_considered, x='month_name', order=month_order)
plt.xlabel('Month')
plt.ylabel('Count')
plt.title('How Many Programs are in Each Month')
# heatmap of program counts for each season x age bin
season_age_counts = pd.crosstab(project_data_considered['Seasons'],
                                project_data_considered['age_range_binned'])
sns.heatmap(season_age_counts, cmap='Blues')
# heatmap of program counts for each temperature bin x age bin
temp_age_counts = pd.crosstab(project_data_considered['temp_binned'],
                              project_data_considered['age_range_binned'])
sns.heatmap(temp_age_counts, cmap='Blues')
# category counts by season
season_order = ['Winter', 'Spring', 'Summer', 'Autumn']
sns.countplot(data=project_data_considered, x='Seasons', hue='Category Name',
              order=season_order, palette='tab20')
plt.legend(title='Categories', bbox_to_anchor=(1, 1))
# NOTE(review): label claims millions — confirm the counts are really at that scale
plt.ylabel('Count (in millions)')
plt.title('Category Count for each Season')
# category counts by season, faceted by age group
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
season_grid = sns.FacetGrid(data=project_data_considered, col='age_range_binned',
                            col_wrap=2, aspect=2, legend_out=True, sharex=False)
season_grid.map_dataframe(sns.countplot, x='Seasons', hue='Category Name',
                          order=['Winter', 'Spring', 'Summer', 'Autumn'], palette='tab20')
season_grid.add_legend(title='Category Name', loc='upper left', bbox_to_anchor=(1, 1))
season_grid.set_titles(col_template='Age Group: {col_name} yrs old')
# category counts by temperature bin
sns.countplot(data=project_data_considered, x='temp_binned', hue='Category Name', palette='tab20')
plt.legend(bbox_to_anchor=(1, 1))
plt.xlabel('Temperatures')
plt.title('Category Count for Temperature')
plt.ylabel('Count (in millions)')
# category counts by temperature bin, faceted by age group
temp_grid = sns.FacetGrid(data=project_data_considered, col='age_range_binned',
                          col_wrap=2, aspect=2, legend_out=True, sharex=False)
temp_grid.map_dataframe(sns.countplot, x='temp_binned', hue='Category Name', palette='tab20')
temp_grid.add_legend(title='Category Name', loc='upper left', bbox_to_anchor=(1, 1))
temp_grid.set_titles(col_template='Age Group: {col_name} yrs old')
# program price by season
sns.countplot(data=project_data_considered, x='Seasons', hue='Program Price',
              order=['Winter', 'Spring', 'Summer', 'Autumn'], palette='tab20')
plt.title('Program Price by Season')
plt.ylabel('Count (in millions)')
# program price by season, faceted by age group
price_season_grid = sns.FacetGrid(data=project_data_considered, col='age_range_binned',
                                  col_wrap=2, aspect=2, legend_out=True, sharex=False)
price_season_grid.map_dataframe(sns.countplot, x='Seasons', hue='Program Price',
                                order=['Winter', 'Spring', 'Summer', 'Autumn'], palette='tab20')
price_season_grid.add_legend(title='Program Price', loc='upper left', bbox_to_anchor=(1, 1))
price_season_grid.set_titles(col_template='Age Group: {col_name} yrs old')
# program price by temperature bin
sns.countplot(data=project_data_considered, x='temp_binned', hue='Program Price', palette='tab20')
plt.title('Program Price by Temperature')
plt.xlabel('Temperatures')
plt.ylabel('Count (in millions)')
# program price by temperature bin, faceted by age group
price_temp_grid = sns.FacetGrid(data=project_data_considered, col='age_range_binned',
                                col_wrap=2, aspect=2, legend_out=True, sharex=False)
price_temp_grid.map_dataframe(sns.countplot, x='temp_binned', hue='Program Price', palette='tab20')
price_temp_grid.add_legend(title='Program Price', loc='upper left', bbox_to_anchor=(1, 1))
price_temp_grid.set_titles(col_template='Age Group: {col_name} yrs old')
2.4 Analysis 4
By Owen Handelman
# compare online vs in-person programs on assistance offered:
# free food, participant pay, and scholarships
fig, axs = plt.subplots(1, 3, figsize=(16, 7))
assistance_columns = ['Has Free Food', 'Participants Paid', 'Scholarship Available']
for ax, column in zip(axs, assistance_columns):
    sns.barplot(project_data, y=column, hue='Meeting Type', ax=ax)
# keep only the first panel's legend
axs[1].get_legend().remove()
axs[2].get_legend().remove()
# map CTA bus stops (left) and train stations colored by line (right)
# NOTE(review): `geodatasets` is not imported in the visible header —
# confirm it is imported in an earlier cell of the notebook.
chicago = gpd.read_file(geodatasets.get_path('geoda.chicago_commpop'))
fig, axes = plt.subplots(1, 2, figsize=(15, 10))
chicago.boundary.plot(ax=axes[0])
axes[0].scatter(x=bus_data['Longitude'], y=bus_data['Latitude'],
                s=3, alpha=0.35, color='black')
axes[0].set_title('CTA Bus Station Map')
chicago.boundary.plot(ax=axes[1])
train_color_dict = {'Pink': 'pink', 'Green': 'green', 'Blue': 'blue', 'Brown': 'brown',
                    'Purple': 'purple', 'Red': 'red', 'Orange': 'orange', 'Yellow': 'yellow'}
line_colors = [train_color_dict[cat] for cat in train_data['Line']]
axes[1].scatter(x=train_data['Longitude'], y=train_data['Latitude'],
                s=15, c=line_colors, edgecolors='black')
axes[1].set_title('CTA Train Station Map')
# plot the locations of programs that do and do not provide transport
f2f_transport_yes = project_data_f2f[project_data_f2f['Transport Provided'] == True]
f2f_transport_no = project_data_f2f[project_data_f2f['Transport Provided'] == False]
chicago = gpd.read_file(geodatasets.get_path('geoda.chicago_commpop'))
fig, axes = plt.subplots(1, 2, figsize=(15, 10))
chicago.boundary.plot(ax=axes[0])
axes[0].scatter(x=f2f_transport_no['Longitude'], y=f2f_transport_no['Latitude'],
                s=5, alpha=0.35, color='black')
axes[0].set_title('Programs That Do Not Provide Transportation')
chicago.boundary.plot(ax=axes[1])
axes[1].scatter(x=f2f_transport_yes['Longitude'], y=f2f_transport_yes['Latitude'],
                s=5, alpha=0.5, c='black')
# title grammar fixed: "Do Provided" -> "Do Provide"
axes[1].set_title('Programs That Do Provide Transportation')
# overlay the train station map with the programs that offer transportation
chicago = gpd.read_file(geodatasets.get_path('geoda.chicago_commpop'))
fig, axes = plt.subplots(figsize=(15, 10))
chicago.boundary.plot(ax=axes)
train_color_dict = {'Pink': 'pink', 'Green': 'green', 'Blue': 'blue', 'Brown': 'brown',
                    'Purple': 'purple', 'Red': 'red', 'Orange': 'orange', 'Yellow': 'yellow'}
axes.scatter(x=train_data['Longitude'], y=train_data['Latitude'],
             s=15, c=[train_color_dict[cat] for cat in train_data['Line']],
             edgecolors='black')
axes.scatter(x=f2f_transport_yes['Longitude'], y=f2f_transport_yes['Latitude'],
             s=50, c='black', marker='X')
# title grammar fixed: "Do Provided" -> "Do Provide"
axes.set_title('Programs That Do Provide Transportation')
# number of programs closest to each train line: all programs vs those
# that provide transport
fig, axs = plt.subplots(1, 2, figsize=(15, 6))
sns.countplot(project_data_f2f, x='Closest Train Line', ax=axs[0])
transport_provided = project_data_f2f[project_data_f2f['Transport Provided'] == True]
sns.countplot(transport_provided, x='Closest Train Line', ax=axs[1])
axs[0].set_title('All Programs')
axs[1].set_title('Programs that provide Transport')
# mean distance to the closest train station, split by each assistance flag
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
assistance_flags = ['Has Free Food', 'Participants Paid',
                    'Transport Provided', 'Scholarship Available']
# axs.flat walks the grid row-major: (0,0), (0,1), (1,0), (1,1)
for ax, flag in zip(axs.flat, assistance_flags):
    sns.barplot(data=project_data_f2f, x=flag,
                y='Distance to Closest Train Station (ft)', ax=ax)